import { z } from "zod";
import {
  BaseLLMJudgeMetric,
  LLMJudgeModelSettings,
} from "../BaseLLMJudgeMetric";
import { EvaluationScoreResult } from "@/evaluation/types";
import type { SupportedModelId } from "@/evaluation/models/providerDetection";
import type { LanguageModel } from "ai";
import type { OpikBaseModel } from "@/evaluation/models/OpikBaseModel";
import { generateQuery } from "./template";
import { parseModelOutput } from "./parser";

/**
 * Shape of the payload accepted by `Usefulness.score`: two string fields,
 * `input` and `output`. The class also exposes this schema as its
 * `validationSchema` property.
 */
const validationSchema = z.object({
  input: z.string(),
  output: z.string(),
});

/** Static type of a scoring payload, inferred from `validationSchema`. */
type Input = z.infer<typeof validationSchema>;

/**
 * Response schema for structured output from the LLM.
 *
 * Handed to `this.model.generateString` as its second argument in
 * `Usefulness.score`. It requires a numeric `score` and a string `reason`;
 * the schema itself places no bound on the value of `score`.
 */
const responseSchema = z.object({
  score: z.number(),
  reason: z.string(),
});

/**
 * LLM-as-a-judge metric that rates how useful an AI response is for the
 * request that produced it.
 *
 * A judge model receives a prompt built from the original input and the
 * generated output and is asked for a score together with a justification.
 * The score is intended to lie between 0.0 and 1.0; larger values mean a more
 * useful response.
 *
 * Aspects the judge is asked to weigh:
 * - Helpfulness: does the response solve the user's problem?
 * - Relevance: does it address the specific question asked?
 * - Accuracy: is the information correct and reliable?
 * - Depth: is there enough detail and explanation?
 * - Creativity: does it offer innovative or insightful perspectives?
 * - Level of detail: is the amount of detail appropriate?
 *
 * @example
 * ```typescript
 * import { Usefulness } from 'opik/evaluation/metrics';
 *
 * // Default judge model (gpt-4o)
 * const metric = new Usefulness();
 * const result = await metric.score({
 *   input: "What's the capital of France?",
 *   output: "The capital of France is Paris."
 * });
 * console.log(result.value);  // number between 0.0 and 1.0
 * console.log(result.reason); // justification written by the judge
 *
 * // Model chosen by ID, with temperature and seed
 * const customMetric = new Usefulness({
 *   model: 'gpt-4-turbo',
 *   temperature: 0.7,
 *   seed: 42
 * });
 *
 * // Pre-built model instance
 * import { openai } from '@ai-sdk/openai';
 * const customModel = openai('gpt-4o');
 * const instanceMetric = new Usefulness({ model: customModel });
 *
 * // Advanced generation settings
 * const advancedMetric = new Usefulness({
 *   temperature: 0.5,
 *   maxTokens: 1000,
 *   modelSettings: {
 *     topP: 0.9,
 *     presencePenalty: 0.1
 *   }
 * });
 * ```
 */
export class Usefulness extends BaseLLMJudgeMetric {
  /** Zod schema describing the `{ input, output }` payload that `score` accepts. */
  public readonly validationSchema = validationSchema;

  /**
   * Builds a Usefulness metric.
   *
   * Every option except `name` is forwarded unchanged to `BaseLLMJudgeMetric`;
   * an omitted option is forwarded as `undefined`.
   *
   * @param options - Optional configuration
   * @param options.model - Judge model: a model ID string, a `LanguageModel` instance, or an `OpikBaseModel` instance. Documented default: 'gpt-4o'.
   * @param options.name - Metric name. Falls back to "usefulness_metric" when not provided.
   * @param options.trackMetric - Whether the metric should be tracked. Documented default: true.
   * @param options.temperature - Sampling temperature (0.0-2.0). Lower values give more focused, deterministic output. See https://ai-sdk.dev/docs/reference/ai-sdk-core/generate-text#temperature
   * @param options.seed - Random seed for reproducible outputs; handy for tests and debugging.
   * @param options.maxTokens - Upper bound on the number of tokens generated in the response.
   * @param options.modelSettings - Advanced model settings (topP, topK, presencePenalty, frequencyPenalty, stopSequences)
   */
  constructor(options?: {
    model?: SupportedModelId | LanguageModel | OpikBaseModel;
    name?: string;
    trackMetric?: boolean;
    temperature?: number;
    seed?: number;
    maxTokens?: number;
    modelSettings?: LLMJudgeModelSettings;
  }) {
    super(options?.name ?? "usefulness_metric", {
      model: options?.model,
      trackMetric: options?.trackMetric,
      temperature: options?.temperature,
      seed: options?.seed,
      maxTokens: options?.maxTokens,
      modelSettings: options?.modelSettings,
    });
  }

  /**
   * Asks the judge model how useful `output` is as an answer to `input`.
   *
   * Steps: build the judge prompt with `generateQuery`, obtain generation
   * options from `buildModelOptions`, request a response constrained by
   * `responseSchema`, then convert that response with `parseModelOutput`.
   *
   * @param input - Payload holding the question and the response
   * @param input.input - The text/question that was given to the AI
   * @param input.output - The text/response the AI generated
   * @returns Score result carrying a value (0.0-1.0) and a reason
   *
   * @example
   * ```typescript
   * const metric = new Usefulness();
   * const result = await metric.score({
   *   input: "How do I make pancakes?",
   *   output: "Mix flour, eggs, and milk. Cook on a hot griddle until golden."
   * });
   * console.log(result.value);  // e.g., 0.8
   * console.log(result.reason); // "The response provides clear, actionable steps..."
   * ```
   */
  async score(input: Input): Promise<EvaluationScoreResult> {
    const prompt = generateQuery(input.input, input.output);
    const generationOptions = this.buildModelOptions();

    const rawJudgement = await this.model.generateString(
      prompt,
      responseSchema,
      generationOptions
    );

    return parseModelOutput(rawJudgement, this.name);
  }
}
